In this part, we will visualize the pathways from Over-representation Analysis using heatmap.


Prerequisites - DESeq2 analysis, extraction of DEG list, and pathway analysis

1. Install and load the necessary libraries
knitr::opts_chunk$set(
 warning = FALSE, message = FALSE
)
#install.packages("BiocManager")
#library(BiocManager)
#BiocManager::install("DESeq2")
#BiocManager::install("apeglm")
#BiocManager::install("tidyverse") # includes ggplot2 and dplyr
#BiocManager::install("EnhancedVolcano")
#install.packages("plotly")
#BiocManager::install("ComplexHeatmap") #alternative = pheatmap
#BiocManager::install( "AnnotationDbi" )
#install.packages("devtools")
#install.packages("Rtools")
#BiocManager::install("EnsDb.Hsapiens.v86")
# BiocManager::install( "clusterProfiler" )
# BiocManager::install( "enrichplot" )
# BiocManager::install("DOSE")
#BiocManager::install("ReactomePA", force = TRUE)
#BiocManager::install("msigdbr", force = TRUE)
#BiocManager::install("KEGGREST", force = TRUE)
# BiocManager::install("BiocUpgrade") ## you may need this
# BiocManager::install("GOSemSim", force = TRUE)
# install.packages("remotes")
# library(devtools)
# devtools::install_github("GuangchuangYu/GOSemSim")
#install.packages(c("ggraph", "ggnetwork")) #altering ggplots 
#devtools::install_github("datapplab/pathview")
#update.packages(c("lattice", "spatial"))
#install.packages("igraph")
#library(DESeq2)
library(AnnotationDbi)
library(devtools)
library(tidyverse) # includes ggplot2, for data visualisation. dplyr, for data manipulation.
library(clusterProfiler) # for PEA analysis
library(org.Hs.eg.db)
library(EnsDb.Hsapiens.v86)
library(DOSE)
library(enrichplot) # for visualisations
library(ggupset) # for visualisations
library(ReactomePA)
library(msigdbr)
library(KEGGREST)
library(httr2)
library(DBI)
library(GOSemSim)
library(lattice)
library(spatial)
library(pathview)
library(cowplot)
library(ggridges)
library(ComplexHeatmap)
library(circlize)
library(igraph)

#Set directory, input and output path
getwd()
## [1] "/home/sant/Downloads/Data Science-bioinformatics learning/Bulk-RNA-seq-analysis-DESeq2/Part 2- Pathway analysis"
in_path <- paste0(getwd(),"/Input/") # input path, where your input data is located
GO_BP <- paste0(getwd(),"/ORA/GO/") # Path for counts table
out_heat_GO <- paste0(getwd(),"/ORA-heatmap/GO/") # output path for heatmap results

####2a-Convert data to gene symbols and create heatmap with gene symbols

#load counts table, Ensgene names as rownames
norm_counts <- read.csv(file= paste0(in_path,"norm_counts.csv")) %>% column_to_rownames(var = "X")

#import the top10 combined pathways 
BP_comb_top10 <- read.csv(file = paste0(GO_BP,"BP_top10_correctfinal.csv"))

#create functions to extract data from the enrichGO object, create a list of ensembl genes, and 

slice_head(BP_comb_top10, n=20)
#Extract ORA list from results data frame
extract_ORA <- function(x) { #Function to extract geneID and description from enrichGO as a list
  y <- x$geneID #extract geneIds from enrichGO results
  z <- x$Description #extract Description from enrichGO results
  return(list(geneIDs = y, Description = z))
}

#Split gene names from 
Split_ORA_l <- function(y,z){# Function to separate the gene names in the list, input geneList and Description
  x <- lapply(seq_along(y), function(i){ # y is the geneList from 1st step
    unlist(strsplit(y[i], "/")) #split and return the geneIds as a vector
  })
    names(x) <- z #name of each pathway/list
    return(x) #return the function as a list
}
#Convert ORA list from ENTREZ to gene symbols
Sym_ORA <- function(x) { #list of split genes
  y <- lapply(x, function(i){ # create a list
    z <- mapIds(org.Hs.eg.db, keys = i, keytype = "ENTREZID", column = "SYMBOL") #Use annotationDbi to return the ENSEMBL Ids from Split list
    return(z[!is.na(z)])
  })
  return(y)
}

#Annotate Counts table with gene symbol, remove rownames then convert symbol col to rownames
Counts_S <- norm_counts
Counts_S$symbol <- mapIds(org.Hs.eg.db, keys = rownames(norm_counts), keytype = "ENSEMBL", column = "SYMBOL")

rownames(Counts_S) <- NULL 

Counts_S <- Counts_S[!duplicated(Counts_S$symbol),] %>% na.omit(Counts_S) 

rownames(Counts_S) <- NULL 

Counts_S <- Counts_S %>% column_to_rownames(var = "symbol")

slice_head(Counts_S, n=20)
#Create gene list for heatmap for each pathway by matching ORA list with counts table
ORA_hm_l <- function(x,y) {#x is the ens enrichGO list(from previous step) and y is the counts list
  z <- lapply(x, function(i){
    k <- y[rownames(y) %in% i,] #k is the list of common genes between the counts table and ens list
    return(k)
  })
  return(z) # z is the whole list
}

#Heatmap function using gene symbols
ORA_hm_S <- function(x, top_n = 25){
  k <- lapply(seq_along(x), function(i){ #whole heatmap function
   # print(nrow(x[[i]]))
    if (nrow(x[[i]]) > 0) { #Make sure there are genes in each list
      # print(sapply(x[[i]], is.numeric))
       # Subset to top 20 genes
      if (nrow(x[[i]]) > top_n) {
        w <- x[[i]][1:top_n, ] # Select top 20 rows
      } else {
        w <- x[[i]] # If less than 20 rows, use the whole dataframe
      }

      y <- w[, sapply(w, is.numeric)]
      z <- scale(y)
      # print("Heatmap is being called")
      hm <- Heatmap(z, name = "z-score", #heatmap settings and input
                    row_labels = rownames(w),
                    column_labels = colnames(y),
                    column_title = names(x)[i],
                    column_title_gp = gpar(fontsize = 16), #Bigger title
                    cluster_rows = T, cluster_columns = T,
                    col = colorRamp2(c(min(z), 0, max(z)), c("blue", "white", "red")))      # Save heatmap as PNG
        pdf(file = paste0(out_heat_GO, "Symbol_", names(x)[i], ".pdf")) # Adjust width and height as needed
        draw(hm)
        dev.off()
    return(hm)
    } else { #make sure there are genes in the list or no problems, otherwise it will have no hm and thus return NULL
      return(NULL)
      }
      })
 print(k)
     }
  
 
#Perform all data extraction and heatmap functions
ORA1 <- extract_ORA(BP_comb_top10) #Extract the data from the combined top 10 pathways
ORA2 <- Split_ORA_l(ORA1$geneIDs, ORA1$Description) # Split each gene into a separate character
ORA3a <- Sym_ORA(ORA2) # Convert gene names to symbol
ORA4a <- ORA_hm_l(ORA3a, Counts_S) # Create the gene list with counts using the genes present in normalized counts table
ORA5a <- ORA_hm_S(ORA4a) # Visualize the heatmap
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

print(ORA5a) #Visualize the heatmap
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

####2b-Heatmap with ensembl IDs

# #load counts table, Ensgene names as rownames
# norm_counts <- read.csv(file= paste0(in_path,"norm_counts.csv")) %>% column_to_rownames(var = "X")
# 
# #import the top10 combined pathways 
# BP_comb_top10 <- read.csv(file = paste0(GO_BP,"BP_top10_correctfinal.csv"))
# 
# #create functions to extract data from the enrichGO object, create a list of ensembl genes, and 
# 
# slice_head(BP_comb_top10, n=10)
# 
# #Extract ORA list from results data frame
# extract_ORA <- function(x) { #Function to extract geneID and description from enrichGO as a list
#   y <- x$geneID #extract geneIds from enrichGO results
#   z <- x$Description #extract Description from enrichGO results
#   return(list(geneIDs = y, Description = z))
# }
# 
# #Split gene names from 
# Split_ORA_l <- function(y,z){# Function to separate the gene names in the list, input geneList and Description
#   x <- lapply(seq_along(y), function(i){ # y is the geneList from 1st step
#     unlist(strsplit(y[i], "/")) #split and return the geneIds as a vector
#   })
#     names(x) <- z #name of each pathway/list
#     return(x) #return the function as a list
# }
# 
# #Convert ORA list from ENTREZ to ENSEMBL ID
# Ens_ORA <- function(x) { #list of split genes
#   y <- lapply(x, function(i){ # create a list
#     z <- mapIds(org.Hs.eg.db, keys = i, keytype = "ENTREZID", column = "ENSEMBL") #Use annotationDbi to return the ENSEMBL Ids from Split list
#     return(z[!is.na(z)])
#   })
#   return(y)
# }
# 
# #Create gene list for heatmap for each pathway by matching ORA list with counts table
# ORA_hm_l <- function(x,y) {#x is the ens enrichGO list(from previous step) and y is the counts list
#   z <- lapply(x, function(i){
#     k <- y[rownames(y) %in% i,] #k is the list of common genes between the counts table and ens list
#     return(k)
#   })
#   return(z) # z is the whole list
# }
# 
# #Function to create heatmap for all pathways.. Loop through each pathway and save as a new file
# ORA_hm <- function(x, top_n = 20){
#    k <- lapply(seq_along(x), function(i){ #whole heatmap function
#     if (nrow(x[[i]]) > 0) { #Make sure there are genes in each list
#       print(sapply(x[[i]], is.numeric))
#       print(nrow(x[[i]]))
#        # Subset to top 20 genes
#       if (nrow(x[[i]]) > top_n) {
#         w <- x[[i]][1:top_n, ] # Select top 20 rows
#       } else {
#         w <- x[[i]] # If less than 20 rows, use the whole dataframe
#       }
# 
#       y <- w[, sapply(w, is.numeric)]
#       z <- scale(y)
#       print("Heatmap is being called")
#       hm <- Heatmap(z, name = "z-score", #heatmap settings and input
#                     row_labels = rownames(w),
#                     column_labels = colnames(y),
#                     column_title = names(x)[i],
#                     column_title_gp = gpar(fontsize = 14), #Bigger title
#                     cluster_rows = T, cluster_columns = T,
#                     col = colorRamp2(c(min(z), 0, max(z)), c("blue", "white", "red")))      # Save heatmap as PNG
#         pdf(file = paste0(out_heat_GO, paste0(names(x)[i], ".pdf"))) # Save each file as a pathway name
#         draw(hm)
#         dev.off()
#     return(hm)
#     } else { #make sure there are genes in the list or no problems, otherwise it will have no hm and thus return NULL
#       return(NULL)
#       }
#       })
#  print(k)
#      }
#   
#   
# #Perform data extraction and heatmap
# ORA1 <- extract_ORA(BP_comb_top10) #Extract the data from the combined top 10 pathways
# ORA2 <- Split_ORA_l(ORA1$geneIDs, ORA1$Description) # Split each gene into a separate character
# ORA3 <- Ens_ORA(ORA2) # Convert gene names to ensembl ID
# ORA4 <- ORA_hm_l(ORA3, norm_counts) # Create the gene list with counts using the genes present in normalized counts table
# ORA5 <- ORA_hm(ORA4) # Visualize the heatmap
sessionInfo() 
## R version 4.4.3 (2025-02-28)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.2 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: America/Chicago
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] grid      stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] igraph_2.1.4              circlize_0.4.16          
##  [3] ComplexHeatmap_2.22.0     ggridges_0.5.6           
##  [5] cowplot_1.1.3             pathview_1.43.1          
##  [7] spatial_7.3-17            lattice_0.22-5           
##  [9] GOSemSim_2.33.0           DBI_1.2.3                
## [11] httr2_1.1.0               KEGGREST_1.46.0          
## [13] msigdbr_7.5.1             ReactomePA_1.50.0        
## [15] ggupset_0.4.1             enrichplot_1.26.6        
## [17] DOSE_4.0.0                EnsDb.Hsapiens.v86_2.99.0
## [19] ensembldb_2.30.0          AnnotationFilter_1.30.0  
## [21] GenomicFeatures_1.58.0    GenomicRanges_1.58.0     
## [23] GenomeInfoDb_1.42.3       org.Hs.eg.db_3.20.0      
## [25] clusterProfiler_4.14.4    lubridate_1.9.4          
## [27] forcats_1.0.0             stringr_1.5.1            
## [29] dplyr_1.1.4               purrr_1.0.4              
## [31] readr_2.1.5               tidyr_1.3.1              
## [33] tibble_3.2.1              ggplot2_3.5.1            
## [35] tidyverse_2.0.0           devtools_2.4.5           
## [37] usethis_3.1.0             AnnotationDbi_1.68.0     
## [39] IRanges_2.40.1            S4Vectors_0.44.0         
## [41] Biobase_2.66.0            BiocGenerics_0.52.0      
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.4.3               later_1.4.1                
##   [3] BiocIO_1.16.0               bitops_1.0-9               
##   [5] ggplotify_0.1.2             R.oo_1.27.0                
##   [7] polyclip_1.10-7             graph_1.84.1               
##   [9] XML_3.99-0.18               lifecycle_1.0.4            
##  [11] doParallel_1.0.17           MASS_7.3-64                
##  [13] magrittr_2.0.3              sass_0.4.9                 
##  [15] rmarkdown_2.29              jquerylib_0.1.4            
##  [17] yaml_2.3.10                 remotes_2.5.0              
##  [19] httpuv_1.6.15               ggtangle_0.0.6             
##  [21] sessioninfo_1.2.3           pkgbuild_1.4.6             
##  [23] RColorBrewer_1.1-3          abind_1.4-8                
##  [25] pkgload_1.4.0               zlibbioc_1.52.0            
##  [27] R.utils_2.12.3              ggraph_2.2.1               
##  [29] RCurl_1.98-1.16             yulab.utils_0.2.0          
##  [31] rappdirs_0.3.3              tweenr_2.0.3               
##  [33] GenomeInfoDbData_1.2.13     ggrepel_0.9.6              
##  [35] tidytree_0.4.6              reactome.db_1.89.0         
##  [37] codetools_0.2-20            DelayedArray_0.32.0        
##  [39] ggforce_0.4.2               shape_1.4.6.1              
##  [41] tidyselect_1.2.1            aplot_0.2.4                
##  [43] UCSC.utils_1.2.0            farver_2.1.2               
##  [45] viridis_0.6.5               matrixStats_1.5.0          
##  [47] GenomicAlignments_1.42.0    jsonlite_1.8.9             
##  [49] GetoptLong_1.0.5            ellipsis_0.3.2             
##  [51] tidygraph_1.3.1             iterators_1.0.14           
##  [53] foreach_1.5.2               tools_4.4.3                
##  [55] treeio_1.30.0               Rcpp_1.0.14                
##  [57] glue_1.8.0                  gridExtra_2.3              
##  [59] SparseArray_1.6.1           xfun_0.50                  
##  [61] qvalue_2.38.0               MatrixGenerics_1.18.1      
##  [63] withr_3.0.2                 fastmap_1.2.0              
##  [65] digest_0.6.37               timechange_0.3.0           
##  [67] R6_2.6.1                    mime_0.12                  
##  [69] gridGraphics_0.5-1          colorspace_2.1-1           
##  [71] GO.db_3.20.0                RSQLite_2.3.9              
##  [73] R.methodsS3_1.8.2           generics_0.1.3             
##  [75] data.table_1.16.4           rtracklayer_1.66.0         
##  [77] graphlayouts_1.2.2          httr_1.4.7                 
##  [79] htmlwidgets_1.6.4           S4Arrays_1.6.0             
##  [81] graphite_1.52.0             pkgconfig_2.0.3            
##  [83] gtable_0.3.6                blob_1.2.4                 
##  [85] XVector_0.46.0              htmltools_0.5.8.1          
##  [87] profvis_0.4.0               fgsea_1.32.2               
##  [89] clue_0.3-66                 ProtGenerics_1.38.0        
##  [91] scales_1.3.0                png_0.1-8                  
##  [93] ggfun_0.1.8                 knitr_1.49                 
##  [95] rstudioapi_0.17.1           tzdb_0.4.0                 
##  [97] reshape2_1.4.4              rjson_0.2.23               
##  [99] nlme_3.1-167                curl_6.2.0                 
## [101] GlobalOptions_0.1.2         cachem_1.1.0               
## [103] parallel_4.4.3              miniUI_0.1.1.1             
## [105] restfulr_0.0.15             pillar_1.10.1              
## [107] vctrs_0.6.5                 urlchecker_1.0.1           
## [109] promises_1.3.2              cluster_2.1.8              
## [111] xtable_1.8-4                Rgraphviz_2.50.0           
## [113] KEGGgraph_1.66.0            evaluate_1.0.3             
## [115] cli_3.6.4                   compiler_4.4.3             
## [117] Rsamtools_2.22.0            rlang_1.1.5                
## [119] crayon_1.5.3                plyr_1.8.9                 
## [121] fs_1.6.5                    stringi_1.8.4              
## [123] viridisLite_0.4.2           BiocParallel_1.40.0        
## [125] babelgene_22.9              munsell_0.5.1              
## [127] Biostrings_2.74.1           lazyeval_0.2.2             
## [129] Matrix_1.7-2                hms_1.1.3                  
## [131] patchwork_1.3.0             bit64_4.6.0-1              
## [133] shiny_1.10.0                SummarizedExperiment_1.36.0
## [135] memoise_2.0.1               bslib_0.9.0                
## [137] ggtree_3.14.0               fastmatch_1.1-6            
## [139] bit_4.5.0.1                 ape_5.8-1                  
## [141] gson_0.1.0